library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggpubr)
library(Rtsne)
library(rstatix)
library(expss)
library(ClusterR)
library(DESeq2) ; library(biomaRt)
library(knitr)
library(limma)
# Load Gandal dataset
load('./../Data/preprocessedData/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
There are many more Male than Female samples, but the proportion is the same for ASD and control (~3:1)
table_info = datMeta %>% apply_labels(Diagnosis = 'Diagnosis', Sex = 'Gender')
cro(table_info$Diagnosis, list(table_info$Sex, total()))
| Gender | #Total | |||
|---|---|---|---|---|
| Female | Male | |||
| Diagnosis | ||||
| CTL | 8 | 28 | 36 | |
| ASD | 3 | 10 | 13 | |
| #Total cases | 11 | 38 | 49 | |
There doesn’t seem to be a relation between diagnosis and age
datMeta %>% ggplot(aes(Diagnosis, Age, fill = Diagnosis)) + geom_boxplot() +
geom_jitter(color='gray', size=2, alpha = 0.5) +
stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
theme_minimal() + theme(legend.position = 'none')
The two first principal components separates the samples by diagnosis groups quite well, and all the samples are from the LDPFC, so there’s no need to do the second plot studying brain region patterns
pca = datExpr %>% t %>% prcomp
plot_data = pca$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('Subject_ID'= datExpr %>% colnames) %>%
left_join(datMeta, by='Subject_ID') %>%
dplyr::select('Subject_ID','PC1','PC2','Diagnosis') %>%
mutate('Diagnosis' = factor(Diagnosis, levels=c('CTL','ASD')))
plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) + geom_point(alpha = 0.8) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
ggtitle('Samples coloured by Diagnosis') + coord_fixed() + theme_minimal()
rm(pca)
T-SNE seems to be struggling to separate the samples by Diagnosis
perplexities = c(1,2,5,10)
ps = list()
for(i in 1:length(perplexities)){
set.seed(123)
tsne = datExpr %>% t %>% Rtsne(perplexity=perplexities[i])
plot_data = data.frame('ID'=colnames(datExpr), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2]) %>%
left_join(datMeta, by=c('ID'='Subject_ID')) %>%
dplyr::select('C1','C2','Diagnosis') %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=Diagnosis, shape=Diagnosis)) + geom_point(alpha = 0.7) + theme_minimal() +
ggtitle(paste0('Perplexity = ',perplexities[i])) + theme(legend.position='none') + coord_fixed()
}
grid.arrange(grobs=ps, nrow=2)
Since all the samples are from the same brain region and there is only one sample per subject, there’s no need for the other two plots from this section.
The first principal component explains over 99% of the variance in the dataset and is strongly related to the mean level of expression of the genes
pca = datExpr %>% prcomp
plot_data = data.frame( 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'MeanExpr'=rowMeans(datExpr))
plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.3) + theme_minimal() +
scale_color_viridis() + ggtitle('PCA') + coord_fixed() +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
theme(legend.position = 'bottom', legend.key.width=unit(3, 'cm')) + labs(color='Mean Expression ')
Independently of the perplexity parameter, the mean level of expression of the genes seems to continue to be the main feature that characterises the genes
perplexities = c(1,2,5,10,50,100)
ps = list()
for(i in 1:length(perplexities)){
file_name = paste0('./../Data/Results/tsne_perplexity_',perplexities[i],'.csv')
# Check if file with t-sne coordinates already exists, if not, calculate coordinates
if(file.exists(file_name)){
tsne = read.csv(file_name)
datExpr_tsne = datExpr[!duplicated(datExpr),] # Remove duplicates (2 duplicated rows)
} else {
set.seed(123)
datExpr_tsne = datExpr[!duplicated(datExpr),] # Remove duplicates (2 duplicated rows)
tsne = datExpr_tsne %>% Rtsne(perplexity=perplexities[i])
tsne_coords = cbind(tsne$Y, rownames(datExpr_tsne))
colnames(tsne_coords) = c('C1','C2','ID')
write.csv(tsne_coords, file_name, row.names=F)
tsne = tsne_coords
}
# Plot results
plot_data = data.frame('C1'=tsne[,1], 'C2'=tsne[,2], 'MeanExpr'=rowMeans(datExpr_tsne))
ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=MeanExpr)) + geom_point(alpha=0.5) + theme_minimal() + coord_fixed() +
scale_color_viridis() + ggtitle(paste0('Perplexity = ', perplexities[i])) + theme(legend.position='none')
}
grid.arrange(grobs=ps, nrow=2)
rm(perplexities, ps, i, file_name, tsne, datExpr_tsne)
Samples belonging to the ASD group have higher levels of expression than the Control group but the difference is not statistically significant (this is the only dataset where the difference is not statistically significant)
plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>%
left_join(datMeta, by=c('ID'='Subject_ID'))
plot_data %>% ggplot(aes(Diagnosis, Mean, fill = Diagnosis)) +
geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
ylab('Mean Expression') + theme_minimal() + theme(legend.position = 'none')
There are only 3 DE genes using a threshold of LFC=0 (~0.02% of the total number of genes), so there’s not that much to study here
#lfc_list = c(seq(0, 0.1, 0.01), seq(0.1,0.24,0.02), seq(0.26, 1.11, 0.05))
lfc_list = seq(0, 0.5, 0.05)
n_genes = nrow(datExpr)
# PCA
datExpr_pca_samps = datExpr %>% data.frame %>% t %>% prcomp
# Initialise data frame to save PCA outputs
pcas_samps = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=colnames(datExpr), 'lfc'=-1, PC1=scale(PC1), PC2=scale(PC2))
pca_samps_old = pcas_samps
fit = lmFit(datExpr, design=model.matrix(~SV1 + SV2 + SV3 + SV4 + SV5 + SV6 + SV7 + SV8 + SV9 + SV10 + SV11 +
Diagnosis, data = datMeta))
efit = eBayes(fit)
for(lfc in lfc_list){
# Recalculate DE_info with the new threshold (p-values change) an filter DE genes
DE_genes = topTable(efit, coef = 'DiagnosisASD', sort.by = 'none', n = Inf, lfc = lfc) %>%
dplyr::mutate('ID' = rownames(.)) %>% filter(adj.P.Val < 0.05)
#DE_genes = results(dds, lfcThreshold=lfc, altHypothesis='greaterAbs') %>% data.frame %>%
# mutate('ID'=rownames(.)) %>% filter(padj<0.05)
datExpr_DE = datExpr %>% data.frame %>% filter(rownames(.) %in% DE_genes$ID)
n_genes = c(n_genes, nrow(DE_genes))
# Calculate PCAs
datExpr_pca_samps = datExpr_DE %>% t %>% prcomp
# Create new DF entries
pca_samps_new = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=colnames(datExpr), 'lfc'=lfc, PC1=scale(PC1), PC2=scale(PC2))
# Change PC sign if necessary
if(cor(pca_samps_new$PC1, pca_samps_old$PC1)<0) pca_samps_new$PC1 = -pca_samps_new$PC1
if(cor(pca_samps_new$PC2, pca_samps_old$PC2)<0) pca_samps_new$PC2 = -pca_samps_new$PC2
pca_samps_old = pca_samps_new
# Update DFs
pcas_samps = rbind(pcas_samps, pca_samps_new)
}
# Add Diagnosis information
pcas_samps = pcas_samps %>% left_join(datMeta, by=c('ID'='Subject_ID')) %>%
dplyr::select(ID, PC1, PC2, lfc, Diagnosis)
# Plot change of number of genes
ggplotly(data.frame('lfc'=lfc_list, 'n_genes'=n_genes[-1]) %>% ggplot(aes(x=lfc, y=n_genes)) +
geom_point() + geom_line() + theme_minimal() + xlab('Log Fold Change Magnitude') + ylab('DE Genes') +
ggtitle('Number of Differentially Expressed genes when modifying filtering threshold'))
rm(lfc_list, n_genes, lfc, pca_samps_new, pca_samps_old, datExpr_pca_samps, fit, efit)
library(ggrepel)
ggplotly(genes_info %>% mutate(gene_name = datGenes$hgnc_symbol) %>%
ggplot(aes(shrunken_log2FoldChange, padj, color=significant)) +
geom_point(alpha=0.2, aes(id = gene_name)) + scale_y_sqrt() +
xlab('LFC') + ylab('Adjusted p-value') + theme_minimal() + labs(color = 'DE')) #+
#geom_label_repel(aes(label=ifelse(shrunken_log2FoldChange>0.5 | shrunken_log2FoldChange< -0.39,
# as.character(gene_name),'')), direction = 'y', nudge_y = 0.01)
None of the three DE genes have neuronal annotations
top_genes = genes_info %>% mutate(Gene = datGenes$hgnc_symbol) %>% filter(significant == TRUE) %>%
arrange(-abs(shrunken_log2FoldChange)) %>% #top_n(25, wt=abs(shrunken_log2FoldChange)) %>%
mutate(Neuronal = as.logical(Neuronal), LFC = shrunken_log2FoldChange, n = 1:3)
kable(top_genes %>% dplyr::select(n, Gene, LFC, padj, Neuronal), caption = 'DE genes ordered by LFC magnitude')
| n | Gene | LFC | padj | Neuronal |
|---|---|---|---|---|
| 1 | GPR161 | 0.1430789 | 0.0043334 | FALSE |
| 2 | C1QTNF5 | -0.1019610 | 0.0210336 | FALSE |
| 3 | WDR74 | -0.0408516 | 0.0009733 | FALSE |
PCA plot of genes characterised by each LFC threshold’s corresponding set of DE genes.
The frame corresponding to LFC = -1 doesn’t correspond to any LFC, all of the genes in the dataset were used to create this plot
Very high LFC thresholds are the ones that separate the samples by diagnosis best (around 1) and it even corrects the outlier samples!
The PC values for each LFC were scaled so they were easier to compare (without this scale, as the LFC increases, the points begin to cluster more and more tightly at the centre of the plot)
ggplotly(pcas_samps %>% mutate(abs_lfc=ifelse(lfc==-1,-1,lfc)) %>%
ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) +
geom_point(aes(frame=abs_lfc, ids=ID), alpha=0.7) + coord_fixed() +
theme_minimal() + ggtitle('PCA plot of samples modifying filtering threshold'))
# # Figure for thesis:
# p1 = pcas_samps %>% filter(lfc == 0) %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) +
# geom_point(alpha=0.7) + theme_minimal() + ggtitle('LFC = 0') + theme(legend.position = 'none') + coord_fixed()
# p2 = pcas_samps %>% filter(lfc == 0.1) %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) +
# geom_point(alpha=0.7) + theme_minimal() + ggtitle('LFC = 0.1') + theme(legend.position = 'none') + coord_fixed()
# p3 = pcas_samps %>% filter(lfc == 0.2) %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) +
# geom_point(alpha=0.7) + theme_minimal() + ggtitle('LFC = 0.2') + theme(legend.position = 'none') + coord_fixed()
# p4 = pcas_samps %>% filter(lfc == 0.3) %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) +
# geom_point(alpha=0.7) + theme_minimal() + ggtitle('LFC = 0.3') + coord_fixed()
# ggarrange(p1,p2,p3,p4, nrow=1, common.legend = TRUE, legend='bottom', widths = c(0.22, 0.25, 0.28, 0.24))
PCA plot of samples characterised by all of the genes vs characterised only by the genes found to be differentially expressed (this plot is just a side-by-side comparison of two frames from the plot above)
pca = datExpr %>% t %>% prcomp
plot_data = pca$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'= datExpr %>% colnames) %>%
left_join(datMeta, by=c('ID'='Subject_ID')) %>%
dplyr::select('ID','PC1','PC2','Diagnosis') %>%
mutate('Diagnosis' = factor(Diagnosis, levels=c('CTL','ASD')))
p1 = plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) + geom_point(alpha = 0.8) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
ggtitle('Characterising samples with all genes') +
coord_fixed() + theme_minimal() + theme(legend.position = 'none')
fit = lmFit(datExpr, design=model.matrix(~SV1 + SV2 + SV3 + SV4 + SV5 + SV6 + SV7 + SV8 + + SV9 + SV10 + SV11 +
Diagnosis, data = datMeta))
efit = eBayes(fit)
DE_genes = topTable(efit, sort.by = 'none', n = Inf, coef = 'DiagnosisASD', lfc = 0.5) %>%
dplyr::mutate('ID' = rownames(.))
pca = datExpr[rownames(datExpr) %in% DE_genes$ID,] %>% t %>% prcomp
plot_data = pca$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'= datExpr %>% colnames) %>%
left_join(datMeta, by=c('ID'='Subject_ID')) %>%
dplyr::select('ID','PC1','PC2','Diagnosis') %>%
mutate('Diagnosis' = factor(Diagnosis, levels=c('CTL','ASD')))
p2 = plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) + geom_point(alpha = 0.8) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
ggtitle('Characterising samples with DE genes') +
coord_fixed() + theme_minimal()
ggarrange(p1,p2, nrow=1, common.legend = TRUE, legend='bottom')
rm(p1, p2, pca, plot_data)
The second principal component is slightly related to the LFC of the genes when using all of the genes, and strongly related when using only the differentially expressed genes. If we were to increase the LFC threshold they would probably separate into two clouds as they do in the other two datasets.
# All genes
pca = datExpr %>% prcomp
plot_data = genes_info %>% mutate(PC1 = pca$x[,1], PC2 = pca$x[,2])
pos_zero = -min(plot_data$shrunken_log2FoldChange)/(max(plot_data$shrunken_log2FoldChange)-min(plot_data$shrunken_log2FoldChange))
p1 = plot_data %>% ggplot(aes(PC1, PC2, color=shrunken_log2FoldChange)) + geom_point(alpha=0.5) +
scale_color_gradientn(colours=c('#F8766D','#faa49e','white','#00BFC4','#009499'),
values=c(0, pos_zero-0.15, pos_zero, pos_zero+0.15, 1)) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
theme_minimal() + ggtitle('PCA of all genes') +
theme(legend.position = 'bottom', legend.key.width=unit(2,'cm')) + labs(color = 'LFC ')
# Only DE genes
pca = datExpr[genes_info$significant,] %>% prcomp
plot_data = genes_info %>% dplyr::filter(significant == TRUE) %>% mutate(PC1 = pca$x[,1], PC2 = pca$x[,2])
pos_zero = -min(plot_data$shrunken_log2FoldChange)/(max(plot_data$shrunken_log2FoldChange)-min(plot_data$shrunken_log2FoldChange))
p2 = plot_data %>% ggplot(aes(PC1, PC2, color=shrunken_log2FoldChange)) + geom_point(alpha=0.5) +
scale_color_gradientn(colours=c('#F8766D','#faa49e','white','#00BFC4','#009499'),
values=c(0, pos_zero-0.15, pos_zero, pos_zero+0.15, 1)) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
theme_minimal() + ggtitle('PCA of differentially expressed genes') +
theme(legend.position = 'bottom', legend.key.width=unit(2,'cm')) + labs(color = 'LFC ')
ggarrange(p1,p2, nrow=1, common.legend = TRUE, legend='bottom')
rm(pca, plot_data, pos_zero, p1, p2)
There aren’t enough DE genes to do this comparison
plot_data = data.frame('MeanExpr' = rowMeans(datExpr),
'Group' = ifelse(genes_info$shrunken_log2FoldChange>0,'overexpressed','underexpressed')) %>%
mutate(Group = factor(Group, levels = c('underexpressed','overexpressed')))
plot_data[genes_info$significant==TRUE,] %>% ggplot(aes(Group, MeanExpr, fill = Group)) +
geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
ylab('Mean Expression') + xlab('') + theme(legend.position = 'none') +
theme_minimal() + theme(legend.position = 'none')
## Warning: Computation failed in `stat_compare_means()`:
## not enough 'x' observations
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggrepel_0.8.2 limma_3.40.6
## [3] knitr_1.32 biomaRt_2.40.5
## [5] DESeq2_1.24.0 SummarizedExperiment_1.14.1
## [7] DelayedArray_0.10.0 BiocParallel_1.18.1
## [9] matrixStats_0.56.0 Biobase_2.44.0
## [11] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0
## [13] IRanges_2.18.3 S4Vectors_0.22.1
## [15] BiocGenerics_0.30.0 ClusterR_1.2.1
## [17] gtools_3.8.2 expss_0.10.2
## [19] rstatix_0.6.0 Rtsne_0.15
## [21] ggpubr_0.2.5 magrittr_2.0.1
## [23] GGally_1.5.0 gridExtra_2.3
## [25] viridis_0.5.1 viridisLite_0.3.0
## [27] RColorBrewer_1.1-2 plotlyutils_0.0.0.9000
## [29] plotly_4.9.2 glue_1.4.2
## [31] reshape2_1.4.4 forcats_0.5.0
## [33] stringr_1.4.0 dplyr_1.0.1
## [35] purrr_0.3.4 readr_1.3.1
## [37] tidyr_1.1.0 tibble_3.0.3
## [39] ggplot2_3.3.2 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.8 Hmisc_4.4-0
## [4] plyr_1.8.6 lazyeval_0.2.2 splines_3.6.3
## [7] gmp_0.6-1 crosstalk_1.1.0.1 digest_0.6.27
## [10] htmltools_0.5.1.1 fansi_0.4.1 checkmate_2.0.0
## [13] memoise_1.1.0 cluster_2.1.0 openxlsx_4.1.4
## [16] annotate_1.62.0 modelr_0.1.6 prettyunits_1.1.1
## [19] jpeg_0.1-8.1 colorspace_1.4-1 blob_1.2.1
## [22] rvest_0.3.5 haven_2.2.0 xfun_0.22
## [25] crayon_1.3.4 RCurl_1.98-1.2 jsonlite_1.7.2
## [28] genefilter_1.66.0 survival_3.2-7 gtable_0.3.0
## [31] zlibbioc_1.30.0 XVector_0.24.0 car_3.0-7
## [34] abind_1.4-5 scales_1.1.1 DBI_1.1.0
## [37] Rcpp_1.0.6 xtable_1.8-4 progress_1.2.2
## [40] htmlTable_1.13.3 foreign_0.8-76 bit_4.0.4
## [43] Formula_1.2-3 htmlwidgets_1.5.1 httr_1.4.2
## [46] acepack_1.4.1 ellipsis_0.3.1 farver_2.0.3
## [49] pkgconfig_2.0.3 reshape_0.8.8 XML_3.99-0.3
## [52] nnet_7.3-14 dbplyr_1.4.2 locfit_1.5-9.4
## [55] labeling_0.3 tidyselect_1.1.0 rlang_0.4.10
## [58] AnnotationDbi_1.46.1 munsell_0.5.0 cellranger_1.1.0
## [61] tools_3.6.3 cli_2.0.2 generics_0.0.2
## [64] RSQLite_2.2.0 broom_0.7.0 evaluate_0.14
## [67] yaml_2.2.1 bit64_4.0.5 fs_1.5.0
## [70] zip_2.0.4 xml2_1.2.5 compiler_3.6.3
## [73] rstudioapi_0.11 curl_4.3 png_0.1-7
## [76] ggsignif_0.6.0 reprex_0.3.0 geneplotter_1.62.0
## [79] stringi_1.5.3 highr_0.8 lattice_0.20-41
## [82] Matrix_1.2-18 vctrs_0.3.2 pillar_1.4.6
## [85] lifecycle_0.2.0 cowplot_1.0.0 data.table_1.12.8
## [88] bitops_1.0-6 R6_2.5.0 latticeExtra_0.6-29
## [91] rio_0.5.16 assertthat_0.2.1 withr_2.2.0
## [94] GenomeInfoDbData_1.2.1 hms_0.5.3 grid_3.6.3
## [97] rpart_4.1-15 rmarkdown_2.7 carData_3.0-3
## [100] lubridate_1.7.4 base64enc_0.1-3